started: AL26Apr2019
last updated: AL17Sep2019

Summary

Inputs:

A number of clear ethinic outliers have not been picked up by WECARE-NFE-only PCA.

So, more outliers have been added manually, basing on the joined plot with 1KGP, expanding 26 to 48 outliers

Start_section

Sys.time()
## [1] "2019-09-17 18:12:06 BST"
rm(list=ls())
graphics.off()

library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"

opts_knit$set(root.dir = base_folder)

options(stringsAsFactors = F)
options(warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)

#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588 

Read_data

Read data for joined PCA on Ampliseq and 1KG

load(paste(base_folder, "s01_add_PCs.RData", sep="/"))

source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_joined_ampliseq-nfe_1kgp_PCA/s06_explore_joined_PCA_plots"
load(paste(source_folder, "s01_joined_PCA_plots_293_3216_not_rare_not_in_LD.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"

# Clean-up
rm(source_folder, eigenphen_nfe_kgen.df)

Check data

# List of objects
ls()
## [1] "base_folder"                "eigenphen_ampliseq_kgen.df" "genotypes.mx"               "joined_outliers"            "pc1_th"                     "pc2_th"                     "phenotypes.df"              "variants.df"
# Expected number of samples in eigenvectors
2504+515
## [1] 3019
dim(eigenphen_ampliseq_kgen.df)
## [1] 3019    7
# Dimentions of objects
dim(genotypes.mx)
## [1] 1838  712
dim(variants.df)
## [1] 1838   65
dim(phenotypes.df)
## [1] 712  37

Add ampliseq-nfe outliers to joined data

# Get IDs of the ampliseq-nfe-only outliers
ampliseq_nfe_outliers <- phenotypes.df[phenotypes.df$pc_outlier,"long_ids"]
length(ampliseq_nfe_outliers)
## [1] 26
# All ampliseq outliers are within the ethnic outliers detected in joined PCA
sum(ampliseq_nfe_outliers %in% joined_outliers)
## [1] 26
# Add the column to data frame (for plotting)
ampliseq_nfe_only_outliers <- eigenphen_ampliseq_kgen.df$sample %in% ampliseq_nfe_outliers
eigenphen_ampliseq_kgen.df <- data.frame(eigenphen_ampliseq_kgen.df,ampliseq_nfe_only_outliers)
sum(eigenphen_ampliseq_kgen.df$ampliseq_nfe_only_outliers)
## [1] 26
# Clean-up
rm(ampliseq_nfe_only_outliers)

Make PCA plot

http://www.sthda.com/english/wiki/ggplot2-point-shapes

table(eigenphen_ampliseq_kgen.df$group)
## 
##    AFR    AMR    EAS    EUR    SAS WECARE 
##    661    347    504    503    489    515
# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
               "SAS"="GREY", "EAS"="PINK", 
               "WECARE"="RED")

myColourScale <- scale_colour_manual(values=myColours)

# Static plot
ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group, shape=ampliseq_nfe_only_outliers)) +
  labs(title="PC1 vs PC2", x="PC1", y="PC2") +
  scale_shape_manual(values=c(16,4)) +
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale

# Interactive plot
plotly_group <- factor(eigenphen_ampliseq_kgen.df$group,
  levels=c("AFR","AMR","EAS","SAS","EUR","WECARE"))

g <- ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=plotly_group, shape=ampliseq_nfe_only_outliers, text=sample)) +
  labs(title="PC1 vs PC2 (manual thresholds)", x="PC1", y="PC2") +
  scale_shape_manual(values=c(16,4)) + 
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale 
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(myColours, myColourScale, plotly_group, g)

Add manually selected outliers to phenotypes table

joined_pca_outlier <- phenotypes.df$long_ids %in% joined_outliers
phenotypes.df <- data.frame(phenotypes.df, joined_pca_outlier)

rm(eigenphen_ampliseq_kgen.df, ampliseq_nfe_outliers, 
   joined_outliers, joined_pca_outlier, pc1_th, pc2_th)

Save results

save.image(paste(base_folder, "s02_explore_ethnicity_of_outliers.RData", sep="/"))

Final_section

ls()
## [1] "base_folder"   "genotypes.mx"  "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.9.0  ggplot2_3.2.0 dplyr_0.8.1   knitr_1.23   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        later_0.8.0       pillar_1.4.1      compiler_3.5.1    tools_3.5.1       digest_0.6.19     jsonlite_1.6      evaluate_0.14     tibble_2.1.3      gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.2   rlang_0.3.4       shiny_1.3.2       crosstalk_1.0.0   yaml_2.2.0        xfun_0.7          withr_2.1.2       stringr_1.4.0     httr_1.4.0        htmlwidgets_1.3   grid_3.5.1        tidyselect_0.2.5  glue_1.3.1        data.table_1.12.2 R6_2.4.0          rmarkdown_1.13    purrr_0.3.2       tidyr_0.8.3       magrittr_1.5      promises_1.0.1    scales_1.0.0      htmltools_0.3.6   assertthat_0.2.1  xtable_1.8-4      mime_0.7          colorspace_1.4-1  httpuv_1.5.1      labeling_0.3      stringi_1.4.3     lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4
Sys.time()
## [1] "2019-09-17 18:12:09 BST"